Contents

%run set_theme.ipynb
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode

# Enable plotly's offline rendering inside the notebook.
init_notebook_mode()

# Load the survey extract, then keep only rows whose salary lies strictly
# between 0 and 250000 (both bounds excluded).
df = pd.read_parquet('../data/SO_2014_2022.pq')
df = df[df['Salary'].between(0, 250000, inclusive='neither')]

# Preview the first rows of the filtered frame.
df.head()
Year Salary JobSat YearsCode YearsCodePro Age Education OrgSize LastNewJob Employment RespondentType JobSeek Gender Student Country CodingActivities DevType LearnCodeFrom LangPresent
0 2022 69318.0 <NA> 10 5 25-34 master 500 to 999 employees <NA> fulltime dev <NA> male no Germany School or academic work Data scientist or machine learning specialist;... Books / Physical media;School (i.e., Universit... C;C++;Java;JavaScript;MATLAB;Python;Scala;SQL;...
6 2022 27652.0 <NA> 18 10 25-34 bachelor 1,000 to 4,999 employees <NA> fulltime dev <NA> male no Colombia Hobby Developer, full-stack;Developer, back-end Books / Physical media;Other online resources ... Bash/Shell/PowerShell;Elixir;HTML/CSS;JavaScri...
9 2022 15431.0 <NA> 5 5 25-34 bachelor 20 to 99 employees <NA> fulltime dev <NA> male no Ghana Freelance/contract work Developer, back-end On the job training;Coding Bootcamp JavaScript;Ruby
13 2022 47352.0 <NA> 7 7 45-54 master 10 to 19 employees <NA> fulltime non-dev <NA> male no Belgium Hobby Developer, back-end;Educator or academic;Datab... Books / Physical media;On the job training;Col... Delphi;SQL
22 2022 78084.0 <NA> 25 25 45-54 bachelor 500 to 999 employees <NA> fulltime non-dev <NA> male no Canada Hobby;Contribute to open-source projects Engineer, site reliability;Security professional Books / Physical media;Other online resources ... Bash/Shell/PowerShell;C;JavaScript;Perl;PHP;Py...
# Calculate the mean salary for each age bin, separately for male and female
# respondents, and plot these means as a grouped horizontal bar chart.


def _mean_salary_by_age(frame, gender):
    """Return the mean ``Salary`` per (``Gender``, ``Age``) group for one gender.

    ``observed=False`` is passed explicitly: it is the behaviour the implicit
    default has provided so far (groups for unobserved categorical levels are
    kept), and stating it pins the result across pandas versions while
    silencing the FutureWarning about the changing default.
    """
    return (
        frame.query(f'Gender == "{gender}"')
        .groupby(['Gender', 'Age'], observed=False)
        .agg({'Salary': 'mean'})
        .reset_index()
    )


male_age_salary_df = _mean_salary_by_age(df, 'male')
female_age_salary_df = _mean_salary_by_age(df, 'female')

# Stack the two per-gender tables; male rows first, as the plot expects.
age_salary_df = pd.concat([male_age_salary_df, female_age_salary_df])

fig = px.bar(
    age_salary_df,
    y='Age',
    x='Salary',
    title='Mean Salary<br><sup>Salary increases as people are getting older</sup>',
    orientation='h',
    barmode='group',
    color='Gender',
    color_discrete_map={
        'male': '#5b6fec',
        'female': '#f854ee',
    },
    width=790,
)

# Per-trace hover text: the capitalised trace name followed by the bar's
# x value; the empty <extra></extra> tag suppresses the secondary hover box.
fig.for_each_trace(
    lambda t: t.update(
        hovertemplate='<b>' + t.name.capitalize() + '</b><br>Average salary: %{x:d}<extra></extra>',
        hoverlabel={'font_color': 'white', 'bordercolor': 'white'},
    )
)

# Margins leave room for the caption annotation placed below the plot area.
fig.update_layout(
    margin={'l': 110, 'b': 120, 'r': 130, 't': 100},
)

# Caption positioned in paper coordinates, below and left of the axes.
fig.add_annotation(
    x=-0.13, y=-0.32,
    xref="paper", yref="paper",
    showarrow=False,
    align='left',
    xanchor='left', yanchor='bottom',
    text=(
        'Mean salary per age segment for each of the two genders.<br>'
        'Hover over any bar to view the average salary for a specific age segment.'
    ),
)

fig.show()
C:\Users\Efe\AppData\Local\Temp\ipykernel_14132\3702917771.py:5: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
C:\Users\Efe\AppData\Local\Temp\ipykernel_14132\3702917771.py:10: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.